/* 
    Purpose: Using cleaned 1936 Consumption Survey data,
             this file creates income scores at the 
             occupation-only, occupation x race, and 
             occupation x race x south levels of variation. 
             Income scores are then merged together to 
             create one output file.
	
	  Note: Light cleaning of the data occurs prior to
		    construction of the income scores.

    Creates: ConsumptionSurvey_1936_foranalysis.dta
    			   ConsumptionSurvey_1936_IncomeScores_mainvariations.dta
*/

* Session setup: disable output paging and start from an empty dataset
set more off
clear 

* Relative paths used below are resolved from this folder
cd "$Mydirectory1/1_DataSources/ConsumptionSurvey_1936/"	

*-----------------------------------------------------------------------------*
*-----------------------------------------------------------------------------*

*************************************
*** IMPORT AND CLEAN DATA
*************************************

use "./RawData/pooled_data_w_occ_codes.dta", clear 

	* Retain only the variables that are used or saved later in this file
	keep husband_age tot_fam_inc south occ1_harm white ///
	     mem1_wagetot mem1_wagetot0 fam_wagetot fam_wagetot0 totkids_3

* Age: tabulate to eyeball the 30-50 range (no observations are dropped here)
	tab husband_age
	rename husband_age age
	
* Income: inspect the raw distribution before rescaling
	sum tot_fam_inc, d

/*
   Express income in 1950 dollars using the CPI.
   Source: https://www.minneapolisfed.org/about-us/monetary-policy/inflation-calculator/consumer-price-index-1913-
   The base value 13.8 is the midpoint of 13.7 (1935) and 13.9 (1936).
   The index values are held in temporary variables and removed after use.
*/
	gen CPI1935 = 13.8
	gen CPI1950 = 24.1
	replace tot_fam_inc = tot_fam_inc * (CPI1950 / CPI1935)
	drop CPI1935 CPI1950
	
* Region of residence
	tab south
	rename south south_merge
	
* Race: 1 if white==1, 2 if white==0, missing otherwise
	tab white, m
	gen race = cond(white==1, 1, cond(white==0, 2, .))
	tab race white, m

	* Keep black and white respondents only
	keep if inlist(race, 1, 2)
	drop white

* Occupation
	tab occ1_harm
	rename occ1_harm occ1950ej

* Weight---note: no survey weights, so every observation gets weight one
	gen weight = 1
	
/* Keep only coarsened occupations inside the range of interest
   (codes above 81, including missing codes, are removed) */
	keep if occ1950ej <= 81
	
* Remove observations with missing income, then those with zero income
	drop if tot_fam_inc==.
	drop if tot_fam_inc==0
	
	* Constant used later to count observations per cell
	gen number = 1
	
	* Cleaned microdata reused by every collapse below
	tempfile fulldata 
	save `fulldata'
	
*-----------------------------------------------------------------------------*
*-----------------------------------------------------------------------------*

*************************
*** SAVE MICRODATA
*************************

preserve

	/* Everything between preserve and restore affects only the saved
	   analysis file; the data in memory are rolled back afterwards. */

	* Squared age term
	gen agesq = age*age

	* Variable labels
	label variable race        "Race"
	label variable age         "Husband age"
	label variable occ1950ej   "Coarsened occupation"
	label variable south_merge "Southern region"
	label variable tot_fam_inc "Total family income"
	label variable weight      "Survey weight (one)"
	
	compress 
	save ./output/ConsumptionSurvey_1936_foranalysis.dta, replace
	
restore

*-----------------------------------------------------------------------------*
*-----------------------------------------------------------------------------*

****************
*** TEMPLATES 
****************

* Template 1: occupation x race
*   One row per (occ1950ej, race) with race in {1,2}; saved as `template0'.
	preserve
	
		* Reduce to one row per occupation
		collapse (min) race, by(occ1950ej)
		
		/* Set the first copy to race==1 explicitly. Relying on the
		   collapsed minimum would yield two race==2 rows (and no race==1
		   row) for any occupation observed only with race==2. */
		replace race = 1
		
		expand 2
		bysort occ1950ej: replace race=2 if _n==2
		
		* The template must be uniquely keyed for the 1:1 merges that use it
		isid occ1950ej race
		
		tempfile template0
		save `template0'

* Template 2: occupation x race x south
*   Each row of template 1 is duplicated for south_merge in {1,0};
*   saved as `template'.
		gen south_merge =1
		expand 2
		bysort occ1950ej race: replace south_merge=0 if _n==2
		
		isid occ1950ej race south_merge
		
		tempfile template
		save `template'
		
	restore

*--------------------------------------------------------------*
*--------------------------------------------------------------*

***********************************
*** COLLAPSE (OCCUPATION ONLY)
***********************************
/*Note: No imputations necessary.*/

	* Start from the cleaned microdata
	use `fulldata', clear 

	* Unweighted mean income per occupation, named directly in the collapse
	collapse (mean) avg_totfaminc_1936_byocc = tot_fam_inc, by(occ1950ej)
	label variable avg_totfaminc_1936_byocc "Average income in occ. cell (1950 dollars)"

	* Held for the final merge
	tempfile inc_byocc
	save `inc_byocc'
	
*--------------------------------------------------------------*
*--------------------------------------------------------------*

************************************
*** COLLAPSE (OCCUPATION X RACE)
************************************

	use `fulldata', clear 
			
	* Cell size (number) and mean income for each occupation x race cell
	collapse (rawsum) number (mean) tot_fam_inc, by(occ1950ej race)
	
	tempfile incscore
	save `incscore'
	
* Merge into occupation x race template
	use `template0', clear
	merge 1:1 occ1950ej race using `incscore'
	
	* Every observed cell must exist in the template (same check as in the
	* occupation x race x south section)
	assert _merge!=2
	
* Imputations
	tab race if _merge==1 //imputations needed when race==2
	
	* tag marks cells observed in the data; occ_found counts them per occupation
	gen tag = _merge==3
	sort occ1950ej race 
	
	by occ1950ej: egen occ_found = total(tag)
	
	/* Minimum cell size for a cell to enter the racial-gap calculation.
	   This local is also read by the occupation x race x south section. */
	local threshold "3"
	
	/* Within occupation the rows are ordered race 1, race 2, so on the race 2
	   row the ratio is (race 1 income)/(race 2 income). It is computed only
	   when both cells are observed and both exceed the threshold. */
	by occ1950ej: gen ratio = tot_fam_inc[_n-1] / tot_fam_inc[_n] if occ_found==2 & number>`threshold' & number[_n-1]>`threshold'
	sum ratio [aw=number], d //calculate average racial income gap across occupations
	local ratio2 = r(mean)
	display `ratio2'
	
	* Fill a missing cell with the preceding row's income scaled by the average gap
	by occ1950ej: replace tot_fam_inc = tot_fam_inc[_n-1] / `ratio2' if tot_fam_inc==.
	
	* No cell may remain without an income score after imputation
	assert tot_fam_inc!=.
	
	drop _merge ratio* occ_found tag
	
	rename tot_fam_inc avg_totfaminc_1936_byocc_byr
	label var avg_totfaminc_1936_byocc_byr "Average income in occ-race cell (1950 dollars)"
	
	tempfile inc_byocc_byr
	save `inc_byocc_byr'

	
*--------------------------------------------------------------*
*--------------------------------------------------------------*

********************************************
*** COLLAPSE (OCCUPATION X RACE X SOUTH) 
********************************************
	
foreach j in og altwt {

	/* Two passes over the same procedure:
	     og    - unweighted cell means
	     altwt - cell means using analytic weights totkids_3
	   The local cond carries the optional weight expression for collapse. */
	if "`j'"=="og" local cond " "
	if "`j'"=="altwt" local cond "[aw=totkids_3]"

	use `fulldata', clear 
	
	* Cell size (number) and mean income by occupation x race x south
	collapse (rawsum) number (mean) tot_fam_inc `cond', by(occ1950ej race south_merge)
	
	tempfile incscore_`j'
	save `incscore_`j''

* Merge income scores into occ x race x south template
	use `template', clear

	merge 1:1 occ1950ej race south_merge using `incscore_`j''
	assert _merge!=2
	
	* Template cells with no data are flagged; their income is imputed below
	tab occ1950ej race if _merge==1
	gen flag_impute_1936 = _merge==1 
	drop _merge
	
	
* Imputations (white individuals)
	
  /*Method: Calculate average regional income gap 
            (for white individuals) in a similar 
            occupation. Then rescale non-south,
            white income by average regional
            income gap.

            -Use semi-professionals (occ=18) to 
            impute for social workers (occ=9).
            -Use bookkeepers (occ=30) to 
            impute for typists/stenographers (occ=31).
            -Use inside sales (occ=35) to 
            impute for outside sales (occ=36).
 */
	tab south_merge if race==1 & flag_impute_1936==1
	levelsof occ1950ej if race==1 & south_merge==1 & flag_impute_1936==1, local(occs1)
	
	foreach miss1 in `occs1' {
	
		/* Reset the donor on every pass so that an occupation without a
		   listed donor cannot silently reuse the previous one. */
		local fill1 ""
		if "`miss1'"=="9" local fill1 "18" 
		if "`miss1'"=="31" local fill1 "30" 
		if "`miss1'"=="36" local fill1 "35" 
		
		if "`fill1'"=="" {
			display as error "No donor occupation defined for occ1950ej==`miss1' (run: `j')"
			exit 459
		}
	
		* Donor-occupation white income: numb0 is non-south, numb1 is south
		foreach x in 0 1 {
			sum tot_fam_inc if south_merge==`x' & race==1 & occ1950ej==`fill1'
			local numb`x' = `r(mean)'
		}
		
		/* Calculate average regional income gap in similar occupation
		   (white respondents) */
		local ratio = `numb0'/`numb1'
		display `ratio'
		
    	/*Missing income for white, southern respondents: 
    	  rescale non-south white income by average regional
    	  income gap, by occupation */
		sum tot_fam_inc if south_merge==0 & race==1 & occ1950ej==`miss1'
		replace tot_fam_inc = `r(mean)' / `ratio' if south_merge==1 & race==1 & occ1950ej==`miss1'
		
	}
	
	assert tot_fam_inc!=. if race==1
	
* Imputations (black individuals)
	
	/* Within occupation x south the rows are ordered race 1, race 2, so on
	   the race 2 row the ratio is (race 1 income)/(race 2 income).
	   The local threshold is set in the occupation x race section.
	   The explicit non-missing check on the preceding cell size is needed
	   because a missing value compares as larger than any number; without
	   it a race 1 cell whose income was imputed above (number missing)
	   would enter the average gap.
	   NOTE(review): the preceding row is tested with >= while the current
	   row is tested with >, unlike the occupation x race section where both
	   use >. Left unchanged here; confirm which is intended. */
	sort occ1950ej south_merge race
	by occ1950ej south_merge: gen white_black_ratio = tot_fam_inc[_n-1]/tot_fam_inc[_n] if number[_n-1]>=`threshold' & number[_n-1]<. & number>`threshold' 
	
	// 1. Non-South
    /*Calculate average racial income gap in 
      Non-South across occupation cells */
	quietly sum white_black_ratio if south_merge==0 [aw=number] 
	local ratio1 = r(mean)
	display `ratio1'
	
	levelsof occ1950ej if race==2 & south_merge==0 & number==., local(occs1)
	
    /*Missing income for black respondents: rescale white income by 
      average racial income gap in nonsouth, by occupation */
	foreach y in `occs1' {
		display "occupation: `y'"
		
		sum tot_fam_inc if south_merge==0 & race==1 & occ1950ej==`y'
		replace tot_fam_inc = `r(mean)' / `ratio1' if south_merge==0 & race==2 & occ1950ej==`y' 	
	}
	
	// 2. South
    /*Calculate average racial income gap in 
      South across occupation cells */	
	quietly sum white_black_ratio if south_merge==1 [aw=number] 
	local ratio2 = r(mean)
	display `ratio2'
		
	levelsof occ1950ej if race==2 & south_merge==1 & number==., local(occs1)

    /*Missing income for black respondents: rescale white income by 
      average racial income gap in South, by occupation */
	foreach y in `occs1' {
		display "occupation: `y'"
		
		sum tot_fam_inc if south_merge==1 & race==1 & occ1950ej==`y'
		replace tot_fam_inc = `r(mean)' / `ratio2' if south_merge==1 & race==2 & occ1950ej==`y' 
	}
	
	* Every template cell must now carry an income score
	assert tot_fam_inc!=.
	
	* Cells filled by imputation had no observations
	replace number=0 if number==.

	* Label the variable under its new name instead of relying on abbreviation
	rename number number_obs_cell_1936
	label var number_obs_cell_1936 "Number of observations used to get average income (occ ,race, south)"
	label var flag_impute_1936 "Flag: income imputed (occ ,race, south)"
	rename tot_fam_inc avg_totfaminc_1936
	label var avg_totfaminc_1936 "Average income in occ-race-south cell (1950 dollars)"
	
	* The alt-weight pass contributes only its income score, under a distinct name
	if "`j'"=="altwt" {
		keep occ1950ej race south_merge avg_totfaminc_1936
		rename avg_totfaminc_1936 avg_totfaminc_1936_altwt
	}
	
	tempfile inc_byocc_byr_bys_`j'
	save `inc_byocc_byr_bys_`j''
	
}
	
*--------------------------------------------------------------*
*--------------------------------------------------------------*

******************	
*** MERGE
******************	

	* Base file: unweighted occupation x race x south scores
	use `inc_byocc_byr_bys_og', clear
	
	/* Attach, in order: the alt-weighted scores, the occupation x race
	   scores, and the occupation-only scores. Each merge requires every
	   observation to match and creates no _merge variable. */
	merge m:1 occ1950ej race south_merge using `inc_byocc_byr_bys_altwt', ///
		assert(match) nogenerate
	merge m:1 occ1950ej race using `inc_byocc_byr', ///
		assert(match) keep(match) nogenerate
	merge m:1 occ1950ej using `inc_byocc', ///
		assert(match) keep(match) nogenerate
	
**********************************	
*** SAVE (OCC X RACE X SOUTH)
**********************************	

	* Identifiers, cell size, imputation flag, and every avg_totfaminc_1936* score
	keep occ1950ej race south_merge ///
	     number_obs_cell_1936 flag_impute_1936 ///
	     avg_totfaminc_1936*
	
	compress	
	save ./output/ConsumptionSurvey_1936_IncomeScores_mainvariations.dta, replace 
